select * from products
where product_id in (
select product_id from orders
where order_date > '2023-01-01'
);
select distinct products.*
from products
inner join orders on
products.product_id = orders.product_id
where orders.order_date > '2023-01-01';
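The same result can also be expressed with EXISTS, which avoids the DISTINCT that the join version needs to deduplicate products with multiple orders:
select *
from products p
where exists (
    select 1 from orders o
    where o.product_id = p.product_id
    and o.order_date > '2023-01-01'
);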
select customers.name
from customers
inner join orders on
customers.id = orders.customer_id
where customers.region = 'Europe';
Statistics live in two system views: pg_stat_all_tables and pg_stats
seq_page_cost
random_page_cost
cpu_tuple_cost
cpu_index_tuple_cost
Sequential scan cost = pages × seq_page_cost + rows × cpu_tuple_cost
("rows" = the estimated number of rows, based on pg_stats)
Index scan cost = index_pages × random_page_cost + rows × (cpu_index_tuple_cost + cpu_tuple_cost)
("rows" = the estimated number of rows, based on pg_stats)
The optimizer doesn't always select the best plan: it selects the plan with the lowest estimated cost, and the estimates can be wrong
explain
select *
from orders
where customer_id = 123;
explain analyze
select *
from orders
where customer_id = 123;
explain (analyze, verbose, buffers, wal, memory)
select *
from orders
where customer_id = 123;
Nested Loop (cost=4.65..26.59 rows=5 width=325) (actual time=0.025..0.032 rows=3 loops=1)
  Output: customers.id, customers.name, customers.email, orders.id, orders.order_date, orders.amount
  Buffers: shared hit=8
  ->  Index Scan using customers_pkey on public.customers (cost=0.29..8.30 rows=1 width=126) (actual time=0.012..0.013 rows=1 loops=1)
        Output: customers.id, customers.name, customers.email
        Index Cond: (customers.id = 123)
        Buffers: shared hit=3
  ->  Index Scan using orders_customer_id_idx on public.orders (cost=0.29..18.25 rows=5 width=199) (actual time=0.011..0.015 rows=3 loops=1)
        Output: orders.id, orders.order_date, orders.amount
        Index Cond: (orders.customer_id = 123)
        Buffers: shared hit=5
Planning Time: 0.195 ms
Execution Time: 0.062 ms
Selective queries
select * from customers where id = 12345;
An index on customers.id will make that query faster (given there are enough rows)
Selective queries
select *
from orders
where order_date between '2023-01-01' and '2023-01-31';
An index on orders.order_date will make that query faster (given there are enough rows and not all orders are in January 2023)
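A sketch of the supporting index (the name is illustrative; customers.id from the previous example is normally covered by its primary-key index already):
create index idx_orders_order_date on orders (order_date);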
Join operations
select *
from orders o
inner join order_items i on o.order_id = i.order_id;
An index on the join key, typically order_items.order_id on the foreign-key side, will make that join faster (given there are enough rows; orders.order_id is already indexed if it is the primary key)
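A sketch of that index (name illustrative), on the foreign-key side of the join, since a primary key is indexed automatically:
create index idx_order_items_order_id on order_items (order_id);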
Sort operations
select * from orders
where customer_id = 123
order by order_date desc;
An index on orders.order_date can make that sort faster; a composite index on (customer_id, order_date desc) is even better, since it serves both the filter and the ordering
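A sketch of such a composite index (name illustrative); it matches the filter and delivers rows already in the requested order, so the sort step disappears:
create index idx_orders_customer_date on orders (customer_id, order_date desc);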
Write performance
update orders
set status = 'processed',
processing_date = current_date,
updated_at = now()
where order_date between '2023-01-01' and '2023-01-31'
and status = 'pending';
An index on order_date adds write overhead: every updated row gets a new version, and each index needs a new entry for it. Additional indexes on status, processing_date, and updated_at (the columns being modified) will likely make the situation worse, since they also prevent HOT updates.
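To judge whether an index earns its write overhead, pg_stat_user_indexes tracks how often each one is actually scanned; indexes with idx_scan = 0 are candidates for removal:
select schemaname, relname, indexrelname, idx_scan
from pg_stat_user_indexes
where idx_scan = 0
order by pg_relation_size(indexrelid) desc;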
Low selectivity
select * from customers where status = 'active';
If the planner decides to use the index, the execution will likely be slower than scanning the whole table.
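The value distribution the planner bases this decision on is visible in pg_stats:
select n_distinct, most_common_vals, most_common_freqs
from pg_stats
where tablename = 'customers'
and attname = 'status';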
The Stack Overflow Trap
I have a slow query - just add an index!
Index drawbacks
create table products (
product_id serial primary key,
name text not null,
category text not null,
price decimal(10,2) not null,
in_stock boolean default true,
created_at timestamp default now()
);
create index idx_products_category_price
on products(category, price);
explain analyze
select * from products
where category = 'Electronics'
and price < 500.00;
Seq Scan on products (cost=0.00..169.00 rows=825 width=114)
(actual time=0.019..5.367 rows=812 loops=1)
Filter: ((category = 'Electronics'::text)
AND (price < 500.00))
Rows Removed by Filter: 9588
Planning Time: 0.152 ms
Execution Time: 7.241 ms
D. There are too many matching rows, so a sequential scan is actually faster
Rows Removed by Filter: 9588
Estimates: rows=825
Actual: rows=812
Total rows: 10400
Selectivity: ≈8% → Low
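You can measure the selectivity directly with a filtered aggregate:
select
    count(*) filter (where category = 'Electronics'
        and price < 500.00) as matching,
    count(*) as total
from products;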
create table orders (
order_id integer primary key,
customer_id integer not null,
order_date date not null,
total_amount numeric(10,2) not null,
status text not null
);
create index idx_orders_order_id on orders(order_id);
explain analyze
select * from orders
where order_id = '12345';
Seq Scan on orders (cost=0.00..1830.50 rows=1 width=28)
(actual time=10.243..15.367 rows=1 loops=1)
Filter: ((order_id)::text = '12345'::text)
Rows Removed by Filter: 99999
Planning Time: 0.152 ms
Execution Time: 15.392 ms
C. Create a new functional index on order_id::text
Technically true, but not the best answer.
B. Rewrite where order_id = '12345' into where order_id = 12345
The index can't be used due to the implicit cast to text; rewriting the literal removes the need for casting.
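A sketch of both fixes against this schema (the functional-index name is illustrative):
-- option B (preferred): an integer literal, so the existing index applies
select * from orders where order_id = 12345;
-- option C (works, but adds a redundant second index)
create index idx_orders_order_id_text on orders ((order_id::text));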
create table products (
product_id serial primary key,
name text not null,
category text not null,
price decimal(10,2) not null,
in_stock boolean default true
);
create index idx_products_category on products(category);
explain analyze
select * from products
where category = 'Electronics';
Seq Scan on products (cost=0.00..20833.00
rows=900000 width=122)
(actual time=0.028..245.153 rows=50432 loops=1)
Filter: ((category)::text = 'Electronics'::text)
Rows Removed by Filter: 949568
Planning Time: 0.152 ms
Execution Time: 292.471 ms
The statistics are outdated; running ANALYZE should fix it
Estimates: rows=900000
Actual: rows=50432
The actual row count is roughly 18 times smaller than the estimate
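The fix, optionally with a higher statistics target for the skewed column (500 is an illustrative value):
analyze products;
alter table products alter column category set statistics 500;
analyze products;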
create table customers (
id serial primary key,
name text,
email text,
signup_date date,
last_login timestamptz,
profile_data jsonb
);
explain (analyze, buffers, costs)
select * from customers
order by last_login desc;
Sort (cost=20129.17..20379.17 rows=100000 width=512) (actual time=845.332..945.364 rows=100000 loops=1)
  Sort Key: customers.last_login DESC
  Sort Method: external merge  Disk: 68432kB
  Buffers: shared hit=2584 read=345, temp read=8554 written=8554
  ->  Seq Scan on public.customers (cost=0.00..5834.00 rows=100000 width=512) (actual time=0.023..324.543 rows=100000 loops=1)
        Buffers: shared hit=2584 read=345
Planning Time: 0.132 ms
Execution Time: 1045.402 ms
B. Increase the work_mem parameter
temp read=8554 written=8554
Sort Method: external merge Disk: 68432kB
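A sketch of the session-level experiment (128MB is an illustrative value, comfortably above the 68432kB that spilled to disk):
set work_mem = '128MB';
explain (analyze, buffers)
select * from customers
order by last_login desc;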
create or replace procedure process_pending_orders()
language plpgsql
as $process_pending_orders$
declare
    rec record;
begin
    -- a FOR loop over a query tolerates COMMIT inside the body:
    -- its cursor is converted to a holdable cursor at the first
    -- commit, whereas an explicitly OPENed cursor would be closed
    -- by the commit and the next FETCH would fail
    for rec in
        select id from customer_orders
        where status = 'pending'
        and created_at < now() - interval '3 days'
    loop
        update customer_orders
        set status = 'processed', updated_at = now()
        where id = rec.id;
        commit; -- one transaction per row: safe to interrupt, but slow
    end loop;
end;
$process_pending_orders$;
CALL process_pending_orders();
Time: 374528.654 ms (06:14.529)
B. The COMMIT inside the loop is forcing transaction overhead for each update
Technically true, but there is a better answer.
D. The procedure should use a single UPDATE statement instead of updating rows one at a time
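The set-based rewrite does the same work in one statement and one transaction:
update customer_orders
set status = 'processed',
    updated_at = now()
where status = 'pending'
and created_at < now() - interval '3 days';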